First, we get the dimension of the dataset.
# Load the raw data. The first column holds the sample names, so move it
# into the rownames and keep the remaining columns as the working data.
XX <- read.csv("july2017.csv", header = TRUE)
X <- XX[, -1]           # first column is the sample name
rownames(X) <- XX[, 1]
dim(X)                  ## [1] 3353  313
The overall proportion of NA values is 0.1340624.
Investigate any structure of missing observations.
library(mice)## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
# Tabulate the missing-data patterns. The last column of the md.pattern()
# matrix is the per-pattern count of missing variables, not a variable
# itself, so drop it.
pattern <- md.pattern(X)
pattern <- pattern[, seq_len(ncol(pattern) - 1)]
# The final row gives the number of NAs observed in each variable.
pattern_last <- pattern[nrow(pattern), ]
pattern_last_df <- data.frame(
  variable_name = names(pattern_last),
  num_missing   = pattern_last
)
DT::datatable(pattern_last_df, rownames = FALSE)
# Omit variables with more than 2184 NAs, then samples with more than 50 NAs.
names <- names(which(pattern_last > 2184))
X.1 <- X[, !(colnames(X) %in% names)]
rs <- apply(X.1, 1, function(y) sum(is.na(y)))
# Keep rows with at most 50 NAs. Logical indexing is used on purpose:
# X.1[-which(rs > 50), ] would drop EVERY row when no row exceeds the
# threshold, because x[-integer(0), ] selects nothing.
X.2 <- X.1[rs <= 50, ]
row_name <- XX[, 1]
row_name <- row_name[rs <= 50]
# The overall proportion of NA values is now 0.0231252.
# Return the most frequent value (statistical mode) of a vector.
# NAs are excluded from the tally: this function is used to impute
# missing entries, and without the filter NA itself could win the
# frequency count and be written back as the "mode".
getmode <- function(v) {
  v <- v[!is.na(v)]
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Fill in NA values and set variable types.
# A column with at most 6 distinct observed values is treated as a factor
# (imputed with its mode); otherwise it is numeric (imputed with its median).
# vapply over the columns keeps each column's own type — apply() would
# coerce the whole data frame to a single character matrix. unique() has
# no na.rm argument (it would be silently ignored), so NAs are dropped
# explicitly before counting distinct values.
lu <- vapply(X.2, function(y) length(unique(y[!is.na(y)])), integer(1))
for (i in which(lu <= 6)) {
  X.2[is.na(X.2[, i]), i] <- getmode(X.2[, i])
  X.2[, i] <- factor(X.2[, i])
}
for (i in which(lu > 6)) {
  X.2[, i] <- as.numeric(X.2[, i])
  temp_median <- median(X.2[, i], na.rm = TRUE)
  X.2[is.na(X.2[, i]), i] <- temp_median
}
# Drop variables that are constant across all samples: a single distinct
# observed value carries no information for modeling.
X.3 <- X.2[, !(colnames(X.2) %in% names(which(lu == 1)))]
rownames(X.3) <- row_name
# Move column 146 (the response) to the front.
X.3 <- X.3[, c(146, 1:145, 147:270)]
DT::datatable(X.3)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
Choose mi (heart attack, self-reported history) as the output, and all
remaining variables except st, ht, db, and CVD as the input.
rm(list = ls())
load("./normalized.Rdata")
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded glmnet 2.0-13
# The response is mi; the other outcome variables are excluded from the
# inputs so they cannot leak into the predictors.
names.wanted <- c("mi", "st", "ht", "db", "CVD")
y <- data[, "mi"]
x <- data[, !(colnames(data) %in% names.wanted)]
# One-hot encode the factor columns. Drop the first column (the intercept)
# because glmnet adds its own intercept by default.
xd <- model.matrix(~., x)[, -1]
# Reproducibly hold out one tenth of the samples as a test set.
set.seed(1)
heldout <- sample(nrow(xd), 1 / 10 * nrow(xd))
testx <- xd[heldout, ]
testy <- y[heldout]
trainx <- xd[-heldout, ]
trainy <- y[-heldout]
rm(xd)
cat("distribution of training output:")## distribution of training output:
summary(trainy)## 1 2
## 110 2839
cat("distribution of test output:")## distribution of test output:
summary(testy)## 1 2
## 14 313
One hot encoding of input. Take a look at the test input:
DT::datatable(testx)Dimension of training input:(2949, 339); dimension of test input:(327, 339); length of training output:(2949); length of test output:(327).
\(L=-\frac{1}{n}loglik+\lambda||\beta||_1\)
# Fit the full lasso path (logistic loss, 50 lambda values).
fit <- glmnet(trainx, trainy, family = "binomial", nlambda = 50, alpha = 1)
plot(fit, xvar = "lambda", label = TRUE)
# (In the CV plot below, the y-axis is the average deviance on the
# held-out fold across the 10 folds.)
\(dev=ResDev/N\), where \(ResDev=-2*loglik\).
# Results table: one row per lambda-selection criterion, one column per
# reported statistic. (The row-name spelling is kept as in the original
# report so downstream references still match.)
criteria <- c("CV_deviance", "CV_misclassificaiton_rate", "BIC", "EBIC")
stat_names <- c("lambda", "num_non_zero", "CCR_train", "HUM_train", "CCR_test", "HUM_test")
lbdl <- data.frame(matrix(0, nrow = length(criteria), ncol = length(stat_names)))
rownames(lbdl) <- criteria
colnames(lbdl) <- stat_names
library(doParallel)## Warning: package 'doParallel' was built under R version 3.4.3
## Loading required package: iterators
## Loading required package: parallel
# 10-fold CV with the deviance criterion, run on a 3-worker cluster.
cl <- makeCluster(3)
registerDoParallel(cl)
cvfit <- cv.glmnet(trainx, trainy, family = "binomial",
                   type.measure = "deviance", alpha = 1, nfolds = 10,
                   parallel = TRUE, standardize = TRUE)
# stopImplicitCluster() only stops clusters that registerDoParallel()
# created implicitly. An explicit makeCluster() handle must be shut down
# with stopCluster(), otherwise its connections leak — see the
# "closing unused connection" warnings later in this report.
stopCluster(cl)
plot(cvfit)
# The best lambda is 0.0040135.
lbdl[1, 1] <- cvfit$lambda.min
cosp <- coef(cvfit, s = cvfit$lambda.min)
# Keep only the coefficients stored as non-zero in the sparse matrix.
co_l_cv_mce <- as.matrix(cosp)[seq_len(nrow(cosp)) %in% summary(cosp)[, "i"], ]
lbdl[1, 2] <- length(co_l_cv_mce)
co_l_cv_mce## (Intercept) gender2 agegp3 agegp23
## -5.665137e+00 7.728798e-01 -9.014958e-02 -2.553986e-05
## agegp25 bppul_f anti_chol1 drugs_others1
## -7.116925e-01 5.036103e-03 -3.381137e-01 -7.593035e-01
## hypertension1 hypertension32 hyperlipidaemia11 blood_data1
## -6.863052e-01 -1.857029e-01 -7.158593e-01 5.628131e-01
## bldglu chol GFR_EPI better_eye_bvaR
## -1.759280e-02 5.744405e-01 1.191050e-02 1.766079e-01
## pvalogr_USA2 bvalogr_USA2 bvalogl_USA3 bvalogl_WHO2
## -9.518539e-02 -1.007607e-01 1.434309e-01 -3.206571e-01
## undercorr_L1 bva_bi_USA4 srepsphl srepcylr
## -1.453349e-02 -7.170429e-02 4.296226e-03 1.351580e-01
## srepcyll anisometropia1 RE_type_pva2 highmyopia_pva1
## -1.033051e-01 -2.285010e-01 1.782176e-01 -1.547580e-01
## emmetropia_pva1 lenstatr4 rel_cat4 job_cat32
## 1.086332e-05 -2.985753e-01 1.910830e-02 -3.403103e-02
## home4 smkyn2 smk_cat3 ang2
## -6.574953e-02 5.436503e-01 -1.613196e-01 2.266561e+00
## htchol2 avccr IVAN_CRAE IVAN_AVR
## 7.751236e-02 1.966593e-01 2.058212e-03 5.257568e-01
## L_AMDgradable1 any_AMDgradable1 pigment_R1 R_early_amd1
## -5.697015e-01 2.240904e+00 -1.847695e-01 -2.628506e-02
## geoatrophy_L1 L_late_amd_dry1 any_late_amd_dry1 any_late_amd1
## -6.583244e-01 1.363367e-02 4.012796e-02 8.283072e-01
## amdtype_R1 amdtype_any2 any_DRgradable1 R_retino_cat2
## -1.670538e-02 1.205952e-04 1.324717e-02 -1.047815e+00
## R_VT_retino1 L_NDR_cat2 R_CSME1 CSME_any1
## -3.293194e-02 -1.975072e-01 -6.056883e-01 -6.042936e-01
## sec_glau_R1 POAG_L1 CataractGrading1 nuclear_L1
## -2.588122e-01 -4.180975e-01 -6.387850e-01 -2.402566e-02
## rcrtgp2_R3 rpscar2_R rpscgp2_R2
## 1.695987e-01 -1.353650e-02 -2.393507e-01
There are 63 non-zero variables.
library(mcca)
pvtrain1 <- predict(cvfit,newx=trainx,type="response",s=c(cvfit$lambda.min))
lbdl[1,3] <- ccr(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
lbdl[1,4] <- hum(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
cat("Correct classification rate of traing data:",lbdl[1,3])## Correct classification rate of traing data: 0.9677857
cat("Area under curve of traing data:",lbdl[1,4])## Area under curve of traing data: 0.9367191
pvtest1 <- predict(cvfit,newx=testx,type="response",s=c(cvfit$lambda.min))
lbdl[1,5] <- ccr(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
lbdl[1,6] <- hum(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
cat("Correct classification rate of test data:",lbdl[1,5])## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbdl[1,6])## Area under curve of test data: 0.8455043
# 10-fold CV again, this time selecting lambda by misclassification rate.
cl <- makeCluster(3)
registerDoParallel(cl)
cvfit <- cv.glmnet(trainx, trainy, family = "binomial",
                   type.measure = "class", alpha = 1, nfolds = 10,
                   parallel = TRUE, standardize = TRUE)
# Shut down the explicit cluster; stopImplicitCluster() would not
# release a handle created by makeCluster().
stopCluster(cl)
plot(cvfit)
# The best lambda is 0.0111678.
lbdl[2, 1] <- cvfit$lambda.min
cosp <- coef(cvfit, s = cvfit$lambda.min)
# Keep only the coefficients stored as non-zero in the sparse matrix.
co_l_cv_mce <- as.matrix(cosp)[seq_len(nrow(cosp)) %in% summary(cosp)[, "i"], ]
lbdl[2, 2] <- length(co_l_cv_mce)
co_l_cv_mce## (Intercept) gender2 agegp25 anti_ht1 anti_chol1
## -0.87184018 0.32394937 -0.23154342 -0.19487611 -0.97372931
## drugs_others1 hypertension1 chol GFR_EPI bvalogr_USA2
## -0.48861967 -0.07457401 0.35259987 0.01144681 -0.16002465
## smkyn2 smk_cat3 ang2 R_retino_cat2
## 0.32563476 -0.19493706 1.93992455 -0.47197565
There are 14 non-zero variables.
library(mcca)
pvtrain1 <- predict(cvfit,newx=trainx,type="response",s=c(cvfit$lambda.min))
lbdl[2,3] <- ccr(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
lbdl[2,4] <- hum(trainy,data.frame(1-pvtrain1,pvtrain1),"prob",2)
cat("Correct classification rate of traing data:",lbdl[2,3])## Correct classification rate of traing data: 0.9637165
cat("Area under curve of traing data:",lbdl[2,4])## Area under curve of traing data: 0.9105655
pvtest1 <- predict(cvfit,newx=testx,type="response",s=c(cvfit$lambda.min))
lbdl[2,5] <- ccr(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
lbdl[2,6] <- hum(testy,data.frame(1-pvtest1,pvtest1),"prob",2)
cat("Correct classification rate of test data:",lbdl[2,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbdl[2,6])## Area under curve of test data: 0.8523505
\(BIC=-2loglik+df*log(n)\)
library(grpreg)## Warning: package 'grpreg' was built under R version 3.4.3
fit<- grpreg(X=trainx, y=trainy, penalty="grLasso",family = "binomial",group=1:ncol(trainx))## [1] "Logistic regression modeling Pr(y=2)"
## Warning: 关闭不再使用的链结7(<-Gao-PC:11314)
## Warning: 关闭不再使用的链结6(<-Gao-PC:11314)
## Warning: 关闭不再使用的链结5(<-Gao-PC:11314)
Lambda <- fit$lambda
xlim <- range(Lambda)
plot(Lambda,select(fit,crit="BIC")$IC,xlim=xlim,pch=19,type="o",ylab="BIC",col="red")
lbdl[3,1] <- select(fit,criterion = "BIC")$lambda
abline(v=lbdl[3,1],lwd=3)The best lambda is 0.0101757.
cosp <- select(fit,criterion = "BIC")$beta
cosp[which(cosp!=0)]## (Intercept) gender2 agegp25 anti_ht1 anti_chol1
## -0.89576230 0.37922805 -0.28631894 -0.18955312 -0.98535431
## drugs_others1 hypertension1 chol GFR_EPI bvalogr_USA2
## -0.52309536 -0.13673843 0.36921762 0.01122648 -0.18884284
## srepcylr smkyn2 smk_cat3 ang2 R_retino_cat2
## 0.01703296 0.34831108 -0.19912430 1.97564511 -0.56212888
lbdl[3,2] <- length(cosp[which(cosp!=0)])There are 15 non-zero vbs.
library(mcca)
pre <- predict(fit, X=trainx, type = "response",lambda = lbdl[3,1])
pre <- data.matrix(pre)
lbdl[3,3] <- ccr(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[3,4] <- hum(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)
pre <- predict(fit, X=testx, type = "response",lambda = lbdl[3,1])
pre <- data.matrix(pre)
lbdl[3,5] <- ccr(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[3,6] <- hum(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbdl[3,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbdl[3,6])## Area under curve of test data: 0.8505249
\(EBIC=-2loglik+df*log(n)+df*2*0.5*log(P)\)
library(grpreg)
# EBIC = BIC + 2 * gamma * df * log(P), with gamma = 0.5 here.
Lambda <- fit$lambda
xlim <- range(Lambda)
bic <- select(fit, crit = "BIC", df.method = "active")$IC
ebic <- bic + 2 * 0.5 * log(ncol(trainx)) * predict(fit, type = "nvars")
plot(Lambda, ebic, xlim = xlim, pch = 19, type = "o", ylab = "EBIC", col = "red")
# which.min() ignores NAs and always returns a single index, whereas
# which(ebic == min(ebic, na.rm = TRUE)) can return several indices on
# ties and would then assign a multi-valued result into lbdl[4, 1].
lbdl[4, 1] <- Lambda[which.min(ebic)]
abline(v = lbdl[4, 1], lwd = 3)
# The best lambda is 0.0162026.
cosp <- predict(fit,type="coefficients",lambda = lbdl[4,1])
cosp[which(cosp!=0)]## [1] -0.47821245 0.08003702 -0.10860548 -0.91744884 -0.31589203
## [6] 0.27067687 0.01188295 0.21152310 -0.15449982 1.77293117
lbdl[4,2] <- length(cosp[which(cosp!=0)])There are 10 non-zero vbs.
library(mcca)
pre <- predict(fit, X=trainx, type = "response",lambda = lbdl[4,1])
pre <- data.matrix(pre)
lbdl[4,3] <- ccr(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[4,4] <- hum(y=as.numeric(trainy),d=data.frame(1-pre,pre),method="prob",k=2)
pre <- predict(fit, X=testx, type = "response",lambda = lbdl[4,1])
pre <- data.matrix(pre)
lbdl[4,5] <- ccr(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
lbdl[4,6] <- hum(y=as.numeric(testy),d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbdl[4,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbdl[4,6])## Area under curve of test data: 0.846189
# Keep only the objects needed for the group-lasso section.
vb <- c("trainx", "trainy", "testx", "testy", "lbdl")
rm(list = ls()[!ls() %in% vb])
# Recode the outcome from factor levels 1/2 to numeric 0/1 for grpreg.
testy <- as.numeric(testy) - 1
trainy <- as.numeric(trainy) - 1
load("./index.Rdata")
# Drop the first entry of the grouping index — presumably it corresponds
# to the intercept column of the design matrix; verify against how
# index.Rdata was built.
index <- index[-1]
index## [1] 1 2 3 4 4 4 5 5 5 5 6 7 8 9 10 11 12
## [18] 13 14 15 15 15 16 17 18 19 20 21 22 23 24 24 24 25
## [35] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## [52] 43 43 44 44 45 45 46 46 47 47 48 48 49 49 50 50 51
## [69] 51 52 53 54 55 55 55 55 55 56 56 56 56 56 57 57 57
## [86] 57 57 58 58 58 58 58 59 60 61 62 63 64 65 66 67 68
## [103] 69 69 70 70 71 71 72 73 74 75 76 77 78 79 80 81 82
## [120] 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 97 97
## [137] 98 98 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## [154] 113 114 115 116 116 117 117 118 119 120 121 122 123 124 125 126 126
## [171] 126 127 127 127 128 128 128 128 129 130 131 132 132 132 132 133 133
## [188] 133 134 135 136 137 138 138 139 140 141 142 143 144 145 146 147 148
## [205] 149 150 151 151 151 152 153 154 155 156 157 158 159 160 161 162 163
## [222] 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [239] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 196
## [256] 197 197 198 198 199 200 201 202 203 204 205 205 205 205 206 207 208
## [273] 209 210 211 212 213 214 214 214 214 215 215 215 215 216 217 218 219
## [290] 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
## [307] 237 238 239 240 241 242 243 244 244 245 246 247 247 248 249 250 251
## [324] 251 252 253 254 254 255 256 257 258 259 260 261 262 263 264 265
\(L=-\frac{1}{n}loglik+\lambda\sum_j ||\beta_j||_2\)
library(grpreg)
# Group lasso over the dummy-variable groups defined by `index`.
fit <- grpreg(X = trainx, y = trainy, group = index, penalty = "grLasso",
              family = "binomial")
plot(fit, label = TRUE, log.l = TRUE)
# Results table for the group-lasso criteria. (Row-name spelling kept
# as in the original report.)
lbd <- data.frame(matrix(0, nrow = 4, ncol = 6))
rownames(lbd) <- c("CV_deviance", "CV_misclassificaiton_rate", "BIC", "EBIC")
colnames(lbd) <- c("lambda", "num_non_zero", "CCR_train", "HUM_train", "CCR_test", "HUM_test")
cvfit <- cv.grpreg(X=trainx, y=trainy, group=index, penalty="grLasso",family = "binomial",nfolds=10,seed=1,trace=F)
lbd[1,1]<- cvfit$lambda.min
plot(cvfit)The best lambda is 0.0063906.
cosp <- coef(cvfit,lambda=cvfit$lambda.min)
cosp[which(cosp!=0)]## (Intercept) gender2 age agegp22
## -2.132646e+00 5.474386e-01 -5.195624e-03 7.122076e-04
## agegp23 agegp24 agegp25 anti_ht1
## -2.038723e-03 7.605449e-04 -1.418967e-02 -1.407851e-01
## anti_chol1 drugs_others1 hypertension1 hyperlipidaemia11
## -9.161848e-01 -6.972990e-01 -4.264602e-01 -9.828030e-02
## bldglu chol GFR_EPI better_eye_bvaR
## -3.143733e-03 4.539643e-01 1.113910e-02 1.815634e-01
## srepcylr anisometropia1 emmetropia_pva1 smkyn2
## 9.564015e-02 -1.370607e-01 3.040640e-02 5.979631e-01
## ang2 IVAN_CRAE L_AMDgradable1 any_AMDgradable1
## 2.148121e+00 1.671462e-05 -1.109394e-01 1.218256e+00
## pigment_R1 any_late_amd1 any_DRgradable1 R_retino1
## -5.204764e-02 1.748310e-02 4.425398e-03 -1.882591e-02
## R_retino_cat1 R_retino_cat2 R_retino_cat3 R_retino_cat5
## 3.806642e-02 -3.420575e-01 -1.273258e-01 -1.635117e-01
## retino_cat_worse R_CSME1 CSME_any1 POAG_L1
## -9.113397e-03 -2.257232e-01 -3.576518e-01 -9.945183e-02
## rpscar2_R
## -1.046879e-02
lbd[1,2] <- length(cosp[which(cosp!=0)])There are 37 non-zero vbs.
library(mcca)
pre <- predict(cvfit, X=trainx, type = "response",lambda = lbd[1,1])
pre <- data.matrix(pre)
lbd[1,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[1,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
pre <- predict(cvfit, X=testx, type = "response",lambda = lbd[1,1])
pre <- data.matrix(pre)
lbd[1,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[1,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[1,5])## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbd[1,6])## Area under curve of test data: 0.8386581
plot(cvfit,type="pred",vertical.line = F)
cvfit_sum <- summary(cvfit)
lbd[2,1] <- cvfit_sum$lambda[which(cvfit_sum$pe ==min(cvfit_sum$pe))]
abline(v=log(lbd[2,1]),lwd=3)The best lambda is 0.008448.
cosp <- coef(cvfit,lambda=lbd[2,1])
lbd[2,2] <- length(cosp[which(cosp!=0)])
cosp[which(cosp!=0)]## (Intercept) gender2 age anti_ht1
## -1.262383e+00 4.273564e-01 -5.085371e-03 -1.732883e-01
## anti_chol1 drugs_others1 hypertension1 chol
## -9.880625e-01 -6.096063e-01 -2.466116e-01 4.058881e-01
## GFR_EPI better_eye_bvaR srepcylr anisometropia1
## 1.127762e-02 1.089449e-01 6.633619e-02 -3.917945e-02
## smkyn2 ang2 any_AMDgradable1 any_DRgradable1
## 5.399146e-01 2.054450e+00 3.531465e-01 7.914514e-07
## retino_cat_worse CSME_any1 rpscar2_R
## -4.175693e-02 -1.699728e-01 -4.661313e-03
There are 19 non-zero vbs.
library(mcca)
pre <- predict(cvfit, X=trainx, type = "response",lambda = lbd[2,1])
pre <- data.matrix(pre)
lbd[2,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[2,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
pre <- predict(cvfit, X=testx, type = "response",lambda = lbd[2,1])
pre <- data.matrix(pre)
lbd[2,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[2,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[2,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbd[2,6])## Area under curve of test data: 0.8322684
Lambda <- fit$lambda
xlim <- range(Lambda)
plot(Lambda,select(fit,crit="BIC",df.method="default")$IC,xlim=xlim,pch=19,type="o",ylab="BIC",col="red")
lbd[3,1] <- select(fit,criterion = "BIC")$lambda
abline(v=lbd[3,1],lwd=3)The best lambda is 0.0092717.
cosp <- select(fit,criterion = "BIC")$beta
cosp[which(cosp!=0)]## (Intercept) gender2 age anti_ht1
## -0.866920415 0.384591621 -0.005063179 -0.182959437
## anti_chol1 drugs_others1 hypertension1 chol
## -0.980664378 -0.575060465 -0.188601266 0.390773848
## GFR_EPI better_eye_bvaR srepcylr anisometropia1
## 0.011383520 0.076532787 0.051032856 -0.001409315
## smkyn2 ang2 retino_cat_worse CSME_any1
## 0.516494523 2.019505417 -0.034955107 -0.031755151
## rpscar2_R
## -0.001904645
lbd[3,2] <- length(cosp[which(cosp!=0)])There are 17 non-zero vbs.
library(mcca)
pre <- predict(fit, X=trainx, type = "response",lambda = lbd[3,1])
pre <- data.matrix(pre)
lbd[3,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[3,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
pre <- predict(cvfit, X=testx, type = "response",lambda = lbd[3,1])
pre <- data.matrix(pre)
lbd[3,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[3,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[3,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbd[3,6])## Area under curve of test data: 0.8343222
library(grpreg)
Lambda <- fit$lambda
xlim <- range(Lambda)
bic <- select(fit,crit="BIC",df.method="active")$IC
ebic <- bic+2*0.5*log(dim(trainx)[2])*predict(fit,type="nvars")
plot(Lambda,ebic,xlim=xlim,pch=19,type="o",ylab="EBIC",col="red")
lbd[4,1] <- Lambda[which(ebic==min(ebic,na.rm=T))]
abline(v=lbd[4,1],lwd=3)The best lambda is 0.0147632.
cosp <- predict(fit,type="coefficients",lambda = lbd[4,1])
cosp[which(cosp!=0)]## [1] -0.74786667 0.13265368 -0.15407222 -0.93135420 -0.37203412 0.29529415
## [7] 0.01227613 0.34899345 1.81360064
lbd[4,2] <- length(cosp[which(cosp!=0)])There are 9 non-zero vbs.
library(mcca)
pre <- predict(fit, X=trainx, type = "response",lambda = lbd[4,1])
pre <- data.matrix(pre)
lbd[4,3] <- ccr(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[4,4] <- hum(y=trainy+1,d=data.frame(1-pre,pre),method="prob",k=2)
pre <- predict(fit, X=testx, type = "response",lambda = lbd[4,1])
pre <- data.matrix(pre)
lbd[4,5] <- ccr(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
lbd[4,6] <- hum(y=testy+1,d=data.frame(1-pre,pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbd[4,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbd[4,6])## Area under curve of test data: 0.8409402
vb <- c("trainx","trainy","testx","testy","lbdl","lbd","index")
rm(list=ls()[!ls() %in% vb])
trainy <- factor(trainy)
testy <- factor(testy)
library(msgl)## Warning: package 'msgl' was built under R version 3.4.3
## Loading required package: sglOptim
## Warning: package 'sglOptim' was built under R version 3.4.3
\(-loglik+\lambda[(1-\alpha)\sum_{j=1}^J r_j*||\beta_j||_2+\alpha||\beta||_1]\)
Where \(r_j=(2*\#\{\beta_j\})^\frac{1}{2}\)
l <- lambda(x=trainx, classes=trainy,d=200,lambda.min=0.001, intercept = TRUE,standardize = TRUE,grouping =index)
fit <- msgl::fit(x=trainx, classes=trainy, alpha = 0.5, lambda = l,grouping = index,
standardize = TRUE,d = 200, intercept = TRUE)##
## Running msgl (dense design matrix)
##
## Samples: Features: Classes: Groups: Parameters:
## 2.949k 340 2 266 680
# Collect the sparse-group-lasso coefficient path: one column per lambda,
# one row per parameter (340 parameters per class — see the msgl summary
# printed above).
be <- fit$beta
n_lambda <- length(be)
coef_path <- matrix(0, 340, n_lambda)
for (i in seq_len(n_lambda)) {
  coef_path[, i] <- as.matrix(be[[i]])[1, ]
}
# Use a dedicated name for the colour palette: `cl` is also used for the
# parallel cluster handle elsewhere in this script.
path_cols <- rainbow(340)
plot(0, 0, xlim = c(0, 0.05), ylim = c(-2.5, 2), type = "n",
     xlab = "lambda", ylab = "coef")
for (i in 2:340) {
  lines(fit$lambda, coef_path[i, ], col = path_cols[i], type = "l")
}
# Results table for the sparse-group-lasso criteria. (Row-name spelling
# kept as in the original report.)
lbds <- data.frame(matrix(0, nrow = 4, ncol = 6))
rownames(lbds) <- c("CV_deviance", "CV_misclassificaiton_rate", "BIC", "EBIC")
colnames(lbds) <- c("lambda", "num_non_zero", "CCR_train", "HUM_train", "CCR_test", "HUM_test")
library(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)
# 10-fold CV for the sparse group lasso.
fit.cv <- cv(x = trainx, classes = trainy, fold = 10, alpha = 0.5, lambda = l,
             use_parallel = TRUE, grouping = index, standardize = TRUE,
             intercept = TRUE)
# Release the explicit cluster. Leaving it running leaks its connections —
# the "closing unused connection" warnings scattered through this report
# come from clusters that were never stopped.
stopCluster(cl)
## Running msgl 10 fold cross validation (dense design matrix)
##
##  Samples:  Features:  Classes:  Groups:  Parameters:
##    2.949k        340         2      266         680
id <- which(Err(fit.cv, type = "loglike")==min(Err(fit.cv, type = "loglike")))[1]
lbds[1,1]<- l[id]
Lambda <- l
xlim <- range(Lambda)
plot(Lambda,Err(fit.cv, type = "loglike"),xlim=xlim,pch=19,type="o",ylab="CV_dev",col="red")
abline(v=lbds[1,1],lwd=3)The best lambda is 0.0034035.
as.matrix(coef(fit, id))[1,]## Intercept gender2 agegp22
## 1.545380e+00 -4.085206e-01 -2.382046e-02
## agegp23 agegp24 agegp25
## 4.671626e-02 -1.986130e-02 3.255235e-01
## bppul_f anti_chol1 drugs_others1
## -3.421861e-03 1.448584e-01 3.964860e-01
## drugs_unknown1 hypertension1 hypertension32
## 3.529373e-03 4.571243e-01 1.316352e-02
## hyperlipidaemia11 blood_data1 bldglu
## 4.123419e-01 -3.513039e-01 1.066707e-02
## chol GFR_EPI CKD_EPI1
## -3.004650e-01 -7.056104e-03 -4.519768e-02
## better_eye_bvaR pvalogr_USA2 bvalogr_USA2
## -1.066048e-01 1.702083e-02 5.505719e-02
## bvalogl_WHO2 bvalogl_WHO3 undercorr_L1
## 1.500131e-01 -7.748809e-02 3.826679e-02
## srepsphl srepcylr srepcyll
## -1.483967e-03 -8.270729e-02 8.541625e-02
## anisometropia1 highhyperopiar1 emmetropiar1
## 1.323592e-01 -1.097117e-02 -1.106608e-02
## highmyopia_pva1 highhyperopia_pva1 emmetropia_pva1
## 1.355630e-01 -2.649955e-02 -9.790805e-02
## opp_R lenstatr2 lenstatr4
## -4.782955e-03 2.884901e-06 1.743796e-04
## cocataract_L1 race2 job_cat32
## 1.254231e-02 1.039095e-02 4.983708e-02
## home3 home4 smkyn2
## -4.400031e-04 1.006029e-03 -3.216387e-01
## smk_cat3 catyn2 ang2
## 3.030527e-02 1.083345e-02 -1.149143e+00
## htchol2 avccr IVAN_CRAE
## -3.524412e-02 -1.520509e-01 -1.781839e-03
## IVAN_AVR L_AMDgradable1 any_AMDgradable1
## -3.532818e-01 3.848282e-01 -1.228562e+00
## pigment_R1 R_early_amd1 geoatrophy_L1
## 1.101177e-01 5.925148e-02 4.938554e-01
## L_late_amd_dry1 any_late_amd1 any_DRgradable1
## -3.511918e-01 -5.395305e-01 -2.981747e-03
## R_retino_cat1 R_retino_cat2 R_retino_cat3
## -1.667822e-02 4.783728e-01 6.948303e-02
## R_retino_cat5 R_CSME1 CSME_any1
## 9.901070e-02 3.461595e-01 2.474139e-01
## sec_glau_R1 pri_ACD_R1 POAG_L1
## 1.789268e-01 -6.339169e-02 2.455129e-01
## PACG_any1 CataractGrading1 nuclear_L1
## -4.638681e-04 4.203605e-01 4.042717e-02
## rcrtgp2_R2 rcrtgp2_R3 rpscar2_R
## 3.160647e-03 -9.370072e-02 5.863833e-03
## rpscgp2_R2 any_psc1
## 9.028384e-02 2.116627e-02
lbds[1,2] <- length(as.matrix(coef(fit, id))[1,])There are 74 non-zero vbs.
library(mcca)
res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[1,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[1,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)
res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[1,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[1,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[1,5])## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbds[1,6])## Area under curve of test data: 0.8404838
id <- which(Err(fit.cv, type = "rate")==min(Err(fit.cv, type = "rate")))[1]
lbds[2,1]<- l[id]
plot(Lambda,Err(fit.cv, type = "rate"),xlim=xlim,pch=19,type="o",ylab="CV_dev",col="red")
abline(v=lbds[2,1],lwd=3)The best lambda is 0.005706.
as.matrix(coef(fit, id))[1,]## Intercept gender2 agegp22 agegp23
## -0.7492293199 -0.3089418187 -0.0125955592 0.0196953985
## agegp25 bppul_f anti_ht1 anti_chol1
## 0.1693088481 -0.0003835190 0.0584737902 0.4540023401
## drugs_others1 hypertension1 hyperlipidaemia11 blood_data1
## 0.3573195358 0.2596668727 0.0749868898 -0.0614854941
## bldglu chol GFR_EPI better_eye_bvaR
## 0.0030411046 -0.2351715220 -0.0056398891 -0.0834264798
## bvalogr_USA2 srepcylr anisometropia1 emmetropia_pva1
## 0.0422656882 -0.0489389904 0.0834752344 -0.0386819443
## smkyn2 smk_cat3 ang2 IVAN_CRAE
## -0.2836284095 0.0350365447 -1.0838605311 -0.0003267495
## L_AMDgradable1 any_AMDgradable1 pigment_R1 any_late_amd1
## 0.1225242418 -0.7324240117 0.0470319630 -0.0854735414
## any_DRgradable1 R_retino_cat2 R_retino_cat3 R_retino_cat5
## -0.0030994879 0.3315163995 0.0595516881 0.0847097416
## R_CSME1 CSME_any1 POAG_L1 rpscar2_R
## 0.1594425583 0.2222287297 0.1100003720 0.0053894390
## rpscgp2_R2 any_psc1
## 0.0023600633 0.0220580679
lbds[2,2] <- length(as.matrix(coef(fit, id))[1,])There are 38 non-zero vbs.
library(mcca)
res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[2,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[2,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)
res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[2,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[2,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[2,5])## Correct classification rate of test data: 0.9602446
cat("Area under curve of test data:",lbds[2,6])## Area under curve of test data: 0.8457325
# Training log-likelihood at each lambda along the msgl path.
res <- predict(fit, trainx)
n_models <- length(res$response)
lik <- vapply(seq_len(n_models), function(i) {
  prob <- t(res$response[[i]])  # rows = samples, cols = class probabilities
  sum(log(prob[trainy == 0, 1])) + sum(log(prob[trainy == 1, 2]))
}, numeric(1))
# Model size: number of non-zero coefficients at each lambda.
df <- vapply(seq_len(n_models), function(i) {
  length(as.matrix(coef(fit, i))[1, ])
}, integer(1))
# BIC = -2 * loglik + df * log(n).
bic <- -2 * lik + df * log(nrow(trainx))
Lambda <- fit$lambda
xlim <- range(Lambda)
plot(Lambda,bic,xlim=xlim,pch=19,type="o",ylab="BIC",col="red")
id <- which(bic==min(bic))
lbds[3,1] <- Lambda[id]
abline(v=lbds[3,1],lwd=3)The best lambda is 0.0125053.
as.matrix(coef(fit, id))[1,]## Intercept gender2 age anti_ht1 anti_chol1
## -1.498544960 -0.118253876 0.001600595 0.103841250 0.477021800
## drugs_others1 chol GFR_EPI smkyn2 ang2
## 0.225972115 -0.166133345 -0.005971500 -0.207175487 -0.946986846
lbds[3,2] <- length(as.matrix(coef(fit, id))[1,])There are 10 non-zero vbs.
library(mcca)
#res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[3,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[3,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)
res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[3,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[3,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[3,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbds[3,6])## Area under curve of test data: 0.8400274
ebic <- bic+df*2*1*log(dim(trainx)[2])
plot(Lambda,ebic,xlim=xlim,pch=19,type="o",ylab="EBIC",col="red")
id <- which(ebic==min(ebic))
lbds[4,1] <- Lambda[id]
abline(v=lbds[4,1],lwd=3)The best lambda is 0.0148558.
as.matrix(coef(fit, id))[1,]## Intercept gender2 anti_ht1 anti_chol1 drugs_others1
## -1.473369292 -0.067060822 0.077682024 0.466963562 0.184767312
## chol GFR_EPI smkyn2 ang2
## -0.146164806 -0.006094271 -0.171624229 -0.904404551
lbds[4,2] <- length(as.matrix(coef(fit, id))[1,])There are 9 non-zero vbs.
library(mcca)
res <- predict(fit, trainx)
pre<- res$response[[id]]
lbds[4,3] <- ccr(y=trainy,d=t(pre),method="prob",k=2)
lbds[4,4] <- hum(y=trainy,d=t(pre),method="prob",k=2)
res <- predict(fit, testx)
pre<- res$response[[id]]
lbds[4,5] <- ccr(y=testy,d=t(pre),method="prob",k=2)
lbds[4,6] <- hum(y=testy,d=t(pre),method="prob",k=2)
cat("Correct classification rate of test data:",lbds[4,5])## Correct classification rate of test data: 0.9633028
cat("Area under curve of test data:",lbds[4,6])## Area under curve of test data: 0.8411684
# Combine the three result tables (lasso, group lasso, sparse group lasso)
# and persist them for later comparison.
df <- rbind(lbdl, lbd, lbds)
DT::datatable(df)
save(df, file = "./df.Rdata")